Overview

R Markdown setup

Packages

# Install 'pak' if not already installed
if (!requireNamespace("pak", quietly = TRUE)) {
  install.packages("pak")
}

# Load 'pak' library
library(pak)

# Install the required packages using 'pak'
pak::pak(c(
  "tidyverse",
  "Seurat@4.3.0",
  "SCpubr",
  "scater",
  "org.Mm.eg.db",
  "clusterProfiler"
))
## ℹ Loading metadata database✔ Loading metadata database ... done
##  
## ℹ No downloads are needed
## ✔ 6 pkgs + 246 deps: kept 162 [14.4s]
# Suppress messages while loading the libraries
suppressMessages(library(tidyverse))
suppressMessages(library(Seurat))
suppressMessages(library(SCpubr))
suppressMessages(library(scater))
suppressMessages(library(org.Mm.eg.db))
suppressMessages(library(clusterProfiler))

1. Data Import

# WT
wt1 <- Read10X(
  data.dir = "../../read/scRNA-Seq/WTAC_SCRNA8012500/filtered_feature_bc_matrix/"
)
# wt2 <- Read10X(data.dir = "../../course_data//scRNA-Seq Module/WTAC_SCRNA8012501/filtered_feature_bc_matrix/")

# Scrambled
scr1 <- Read10X(
  data.dir = "../../read/scRNA-Seq/WTAC_SCRNA8012504/filtered_feature_bc_matrix/"
)
scr2 <- Read10X(
  data.dir = "../../read/scRNA-Seq/WTAC_SCRNA8012505/filtered_feature_bc_matrix/"
)

Merging samples

  • Here, we are merge the three samples (WT1, SCR1, SCR2) into one expression matrix. We add the sample of origin in each of the cells, so that later we can retrieve the information.
# WT
w1 <- as.matrix(wt1)
colnames(w1) <- paste(colnames(w1),"w1", sep = "_")
# w2 <- as.matrix(wt2)
# colnames(w2) <- paste(colnames(w2),"w2", sep = "_")

# Scrambled
s1 <- as.matrix(scr1)
colnames(s1) <- paste(colnames(s1),"s1", sep = "_")
s2 <- as.matrix(scr2)
colnames(s2) <- paste(colnames(s2),"s2", sep = "_")

# cbind
x <- cbind(w1, s1, s2)

# we remove these file to save memory.
rm(w1, wt1, s1, scr1, s2, scr2)

2. Seurat Object Instantiation

# Seurat Object Instantiation
seu <- CreateSeuratObject(counts = x, project = "RNA-transcriptomics")

# Batch & Condition Annotation
seu$batch <- str_split(string = colnames(seu),pattern = "_",simplify = T)[,2]
seu$condition <- ifelse(seu$batch == "w1",yes = "WT",no = "SCRAMBLED")

# Mitochondrial Genes Percentage Quantification
seu[["percent.mt"]] <- PercentageFeatureSet(seu, pattern = "^mt-")

# Seurat Object Metadata Exploration
view(seu@meta.data)

3. QC

3.1 Cell QC

  • Single cell QC is done using 3 core metrics:

    • Library Size/Total Counts: Total number of counts per cell for all the features.

    • Total Features: Total number of unique non-zero features per cell.

    • Mitochondrial Genes Percentage: Percentage of a cell’s total counts that is dominated by mitochondrial genes.

  • For library size and total features, the aim is to remove cells with very low or very high values; the underlying assumption here is that cells with low counts and features represent broken droplets that have been broken and have thus failed to capture a cell. On the other hand, cells with high counts and features probably represent doublets or multiplets.

    • There are specific R/Python packages that can be used to detect doublets, e.g. Scrublet or DoubletFinder.
  • For mitochondrial genes percentage, mitochondrial gene expression should not be detected in single cell data unless there is significant mitochondrial permeation and leakage of mitochondrial genes into the cytoplasm, which is indicative of cell stress and cell death (both apoptotic and necrotic). Cells with high percentages of mitochondrial genes in their counts are probably stressed cells that are not representative of the biological variation of interest and are best removed from the data.

  • Defining threshold values for QC is done empirically after inspecting the distributions of total counts and total features; this is best done using key visualisations, namely histograms, violin plots, and scatter plots.

    • Here, we can use the Median Absolute Deviation (MAD) for guidance.

3.1.1. Library Size

# Violin Plot
VlnPlot(object = seu,
        features = "nCount_RNA",
        group.by = "condition") + labs(title = "Library Size")

ggsave(filename = "write/figures/Library Size Violin Plot.png",
       height = 10,
       width = 10)

# MAD Thresholds
outlier_counts <- isOutlier(metric = seu@meta.data$nCount_RNA,
          nmads = 3,
          log = T)
counts_thresholds <- attr(outlier_counts, "thresholds")

# Histogram
seu@meta.data %>% ggplot(aes(x = nCount_RNA)) + 
  geom_histogram(aes(fill = condition)) + labs(x = "Library Size", 
                                               y = "Frequency",
                                               title = "Library Size Histogram") + 
  facet_grid(condition ~ .) + geom_vline(xintercept = counts_thresholds)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggsave(filename = "write/figures/Library Size Histogram.png",
       height = 10,
       width = 10)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

3.1.2. Total features

# Violin Plot
VlnPlot(object = seu,
        features = "nFeature_RNA",
        group.by = "condition") + labs(title = "Total Features")

ggsave(filename = "write/figures/Total Features Violin Plot.png",
       height = 10,
       width = 10)

# MAD Thresholds
outlier_features <- isOutlier(metric = seu@meta.data$nFeature_RNA,
          nmads = 3,
          log = T)
features_thresholds <- attr(outlier_features, "thresholds")

# Histogram
seu@meta.data %>% ggplot(aes(x = nFeature_RNA)) + 
  geom_histogram(aes(fill = condition)) + labs(x = "Total Features", 
                                               y = "Frequency",
                                               title = "Total Features Histogram") +
  facet_grid(condition ~ .) + geom_vline(xintercept = features_thresholds)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggsave(filename = "write/figures/Total Features Histogram.png",
       height = 10,
       width = 10)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

3.1.3. Percentage Mitochondrial Genes

# Violin Plot
VlnPlot(object = seu,
        features = "percent.mt",
        group.by = "condition") + labs(title = "% Mitochondrial Genes")

ggsave(filename = "write/figures/Percentage Mitochondrial Genes Violin Plot.png",
       height = 10,
       width = 10)

# Histogram
seu@meta.data %>% ggplot(aes(x = percent.mt)) + 
  geom_histogram(aes(fill = condition)) + labs(x = "% Mitochondrial Genes", 
                                               y = "Frequency",
                                               title = "% Mitocondrial Features Histogram") +
  facet_grid(condition ~ .) + geom_vline(xintercept = 15)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggsave(filename = "write/figures/Percentage Mitochondrial Genes Histogram.png",
       height = 10,
       width = 10)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

3.1.3. Combined Scatter Plot

  • Setting the thresholds above for each QC metric in isolation is a valid approach, but it can result in loss of important biological variation if the different QC metrics are not visualized together to observe the population of cells that is filtered out by the independent thresholds.

  • The scatter plot below has library size on the x-axis, total number of detected features on the y-axis, and color of the points as % mitochondrial genes with the thresholds super-imposed on top.

  • From the figure, it becomes clear that the thresholds set are rather conservative, and the bulk of the data is presered with only outliers being removed.

# ScatterPlot
seu@meta.data %>%
  ggplot(aes(nCount_RNA,
             nFeature_RNA,
             colour= percent.mt)) +
  geom_point() +
  labs(title = "QC Metrics Combined", 
       x = "Library Size", 
       y = "Detected Features", 
       color = "% Mitochondrial Genes") + 
  scale_color_viridis_c() + 
  facet_grid(condition ~ .) + geom_vline(xintercept = counts_thresholds) + geom_hline(yintercept = features_thresholds)

ggsave(filename = "write/figures/QC Metrics Scatter Plot.png",
       height = 15,
       width = 10)

3.1.4. Cell Filtration

  • Here, we apply the three cell filters. This results in the removal of 455 cells, which is good enough and indicates that the thresholds being set are not too stringent on the data.

  • The final step here is the removal of the seu object we have been using thusfar for memory conservation. In this course, the data size is not substantial and so we can get away with keeping the object; however, when dealing with larger datasets, memory constraints become a substantial

# Classical QC
ncol(seu) #2474
## [1] 2474
seu_subset <- subset(seu, subset = (nCount_RNA > counts_thresholds[1]) & (nCount_RNA < counts_thresholds[2]) & (nFeature_RNA > features_thresholds[1]) & (nFeature_RNA < features_thresholds[2]) & (percent.mt < 15))
ncol(seu_subset) # 2019
## [1] 2019
# Saving memory
rm(seu)

3.2. Genes QC

  • This filtration step will remove all features that do not have non-zero accounts in at least 3 cells.

  • This filtration step is essential since lowly-expressed genes can come up as variable genes in subsequent steps and will skew the results of the analysis if not removed early.

nrow(seu_subset) # 31053
## [1] 31053
filter3 <- function(seurat){
  #Threshold
  threshold <- 3
  #Raw Counts
  counts <- seurat@assays$RNA@counts
  rawcounts <- as.matrix(counts)
  #Number of cells where expression is greater than zero for each gene
  sums <- rowSums(rawcounts > 0)
  #Genes that are expressed in a number of cells greater than threshold
  sums_2 <- sums[sums > threshold]
  #Filtering and dimensions of Seurat object before and after
  seurat <- seurat[names(sums_2),]
  return(seurat)
} # function form of filter 3
seu_subset <- filter3(seurat = seu_subset)
nrow(seu_subset) # 16229
## [1] 16229
  • Note: It’s important to filter first the cells and then the genes.

4. Cell Cycle Scoring & Assignment

# Normalisation, Variable Feature Selection, Data Scaling
seu_subset <- NormalizeData(seu_subset)
seu_subset <- FindVariableFeatures(seu_subset)
seu_subset <- ScaleData(seu_subset, features = rownames(seu_subset))
## Centering and scaling data matrix
# Cell Cycle Features (Mouse Version)
cc.genes.mouse <- readRDS(file = "../../read/scRNA-Seq/cc.genes.mouse.rds")

# Cell Cycle Scoring
seu_subset <- CellCycleScoring(seu_subset, 
                        s.features = cc.genes.mouse$s.genes, 
                        g2m.features = cc.genes.mouse$g2m.genes, 
                        set.ident = TRUE)
table(seu_subset$Phase)
## 
##  G1 G2M   S 
## 925 497 597

5. SCTransform

# SCTransform
seu_subset <- suppressWarnings(SCTransform(object = seu_subset, variable.features.n = 3000))
## Calculating cell attributes from input UMI matrix: log_umi
## Variance stabilizing transformation of count matrix of size 15822 by 2019
## Model formula is y ~ log_umi
## Get Negative Binomial regression parameters per gene
## Using 2000 genes, 2019 cells
##   |                                                                              |                                                                      |   0%  |                                                                              |==================                                                    |  25%  |                                                                              |===================================                                   |  50%  |                                                                              |====================================================                  |  75%  |                                                                              |======================================================================| 100%
## Found 80 outliers - those will be ignored in fitting/regularization step
## Second step: Get residuals using fitted parameters for 15822 genes
##   |                                                                              |                                                                      |   0%  |                                                                              |==                                                                    |   3%  |                                                                              |====                                                                  |   6%  |                                                                              |=======                                                               |   9%  |                                                                              |=========                                                             |  12%  |                                                                              |===========                                                           |  16%  |                                                                              |=============                                                         |  19%  |                                                                              |===============                                                       |  22%  |                                                                              |==================                                                    |  25%  |                                                                              |====================                                                  |  28%  |                                                                              |======================                                                |  31%  |                                                                              |========================                                              |  34%  |                                                                              |==========================                                            |  38%  |                                                                              |============================                                          |  41%  |                                                                              |===============================                                       |  44%  |                                                                              |=================================                                     |  47%  |                                                                              |===================================                                   |  50%  |                                                                              |=====================================                                 |  53%  |                                                                              |=======================================                               |  56%  |                                                                              |==========================================                            |  59%  |                                                                              |============================================                          |  62%  |                                                                              |==============================================                        |  66%  |                                                                              |================================================                      |  69%  |                                                                              |==================================================                    |  72%  |                                                                              |====================================================                  |  75%  |                                                                              |=======================================================               |  78%  |                                                                              |=========================================================             |  81%  |                                                                              |===========================================================           |  84%  |                                                                              |=============================================================         |  88%  |                                                                              |===============================================================       |  91%  |                                                                              |==================================================================    |  94%  |                                                                              |====================================================================  |  97%  |                                                                              |======================================================================| 100%
## Computing corrected count matrix for 15822 genes
##   |                                                                              |                                                                      |   0%  |                                                                              |==                                                                    |   3%  |                                                                              |====                                                                  |   6%  |                                                                              |=======                                                               |   9%  |                                                                              |=========                                                             |  12%  |                                                                              |===========                                                           |  16%  |                                                                              |=============                                                         |  19%  |                                                                              |===============                                                       |  22%  |                                                                              |==================                                                    |  25%  |                                                                              |====================                                                  |  28%  |                                                                              |======================                                                |  31%  |                                                                              |========================                                              |  34%  |                                                                              |==========================                                            |  38%  |                                                                              |============================                                          |  41%  |                                                                              |===============================                                       |  44%  |                                                                              |=================================                                     |  47%  |                                                                              |===================================                                   |  50%  |                                                                              |=====================================                                 |  53%  |                                                                              |=======================================                               |  56%  |                                                                              |==========================================                            |  59%  |                                                                              |============================================                          |  62%  |                                                                              |==============================================                        |  66%  |                                                                              |================================================                      |  69%  |                                                                              |==================================================                    |  72%  |                                                                              |====================================================                  |  75%  |                                                                              |=======================================================               |  78%  |                                                                              |=========================================================             |  81%  |                                                                              |===========================================================           |  84%  |                                                                              |=============================================================         |  88%  |                                                                              |===============================================================       |  91%  |                                                                              |==================================================================    |  94%  |                                                                              |====================================================================  |  97%  |                                                                              |======================================================================| 100%
## Calculating gene attributes
## Wall clock passed: Time difference of 20.80043 secs
## Determine variable features
## Place corrected count matrix in counts slot
## Centering data matrix
## Set default assay to SCT

Notice at this point we have two assays in our data. The active assay is SCT (SCTransform-Normalized data).

# Available Assays
seu_subset@assays
## $RNA
## Assay data with 16229 features for 2019 cells
## Top 10 variable features:
##  Dcpp1, Trap1a, Apoe, Nnat, Emcn, Hes5, Calb2, Dcpp2, Ube2c, Car3 
## 
## $SCT
## SCTAssay data with 15822 features for 2019 cells, and 1 SCTModel(s) 
## Top 10 variable features:
##  Hist1h2ap, Trap1a, Dcpp1, Ube2c, Cenpf, Malat1, Hmgb2, Apoe, Cdc20,
## Top2a
# Active Assay
seu_subset@active.assay
## [1] "SCT"

6. Dimensionality Reduction

6.1. Principal Component Analysis (PCA)

  • The first dimensionality reduction step is PCA. PCA will help us understand the primary sources of variation in the data (be it biological or technical), and is essential for subsequent UMAP dimensionality reduction.

  • PCA is also a good QC step that might point to systematic biases.

# PCA
seu_subset <- RunPCA(seu_subset)
## PC_ 1 
## Positive:  Hmgb2, Top2a, Mki67, Birc5, Pclaf, H2afx, Ube2c, Cdca8, Cdk1, Cenpf 
##     Cks2, Ccnb1, Tpx2, H2afz, Ccna2, Smc2, Nusap1, Cdc20, Hist1h2ap, Prc1 
##     Plk1, Tuba1b, Tubb4b, Spc25, Smc4, Cenpe, Cenpa, Hmmr, Incenp, Racgap1 
## Negative:  Ccnd2, Serpine2, Mxd4, Fos, Sparc, Ptn, Junb, Hist1h2bc, Ccnd1, Vim 
##     Nptx1, Tmem176a, Olig1, Gstm1, Igfbp4, Rhoq, Tmem176b, 1500015O10Rik, Ckb, mt-Nd4 
##     Hotairm1, mt-Co3, Cd81, Gpm6b, Ier2, Tnfrsf21, Igfbp2, Hoxb8, Epha5, Itm2b 
## PC_ 2 
## Positive:  Hist1h2ap, Mcm3, Rrm2, Ung, Hells, Mcm5, Mcm6, Chaf1b, Rpa2, Nop56 
##     Nasp, Mcm7, Atad2, Tipin, Srm, Slbp, Rfc3, Srsf7, Mcm2, Mcm4 
##     Dtl, Lig1, Cdt1, Nop58, Uhrf1, Npm1, Clspn, Apln, Ccne1, Cdc6 
## Negative:  Arl6ip1, Cenpa, Cdc20, Ccnb2, Tubb4b, Pttg1, Cenpf, Ccnb1, Cdca3, Ptn 
##     Zbtb20, Fos, Hmmr, Plk1, Tpx2, Ckap2, Cdkn3, Aspm, Ube2c, Apoe 
##     Cdca8, Cenpe, Cst3, Cks2, Sparc, Birc5, Fth1, Slc1a3, Tacc3, Knstrn 
## PC_ 3 
## Positive:  Hist1h2ap, Tmem176a, Ptn, Lig1, Fth1, Rrm2, Tmem176b, Igfbp4, Atad2, Tmsb4x 
##     Tk1, Mcm3, Malat1, Top2a, Hist1h1b, Chaf1b, Igfbp2, Mcm5, Serpine2, Rpa2 
##     Mcm6, Dek, Clspn, Selenop, Gmnn, Fabp5, Hells, Rhoc, Tipin, Slbp 
## Negative:  Cenpa, Ccnb2, Arl6ip1, H2afz, Hes6, Sox4, Cdc20, Tubb2b, Tuba1a, Cdca3 
##     Pttg1, Hnrnpa1, Olig2, Nrgn, Tubb4b, Cdkn3, Btg2, Ckb, Rps27l, Eef1a1 
##     Hoxa7, Tacc3, Sall3, Bex2, Knstrn, Hsp90ab1, Cd24a, Ppp1r14b, Hmgn2, Ccnb1 
## PC_ 4 
## Positive:  Trap1a, Gstp1, Angpt2, Mageb16, Ifitm3, Emcn, Actg1, Tmsb4x, Nnat, Fkbp10 
##     H13, Chl1, Nap1l5, Fabp5, Slc50a1, Chsy1, Sh3pxd2a, Cenpa, Tuba1a, Pou3f1 
##     Lyn, Camk2n1, Tmc3, Tmem35a, Palmd, 8030474K03Rik, Igf2r, Ajap1, Akap12, Serpinh1 
## Negative:  Hoxb8, Hist1h2ap, Hoxa7, Egr1, Hoxb6, Fos, Malat1, Ier2, Epha5, Top2a 
##     Cdkn2a, Igfbp5, Hoxa9, Junb, Cdkn1a, Btg2, Ckb, Sox4, Mt3, Cyr61 
##     Hoxa10, Hoxb9, Serpine2, Spon1, Nrgn, mt-Nd4, 1500015O10Rik, Lrrn1, Cd81, Hist1h1b 
## PC_ 5 
## Positive:  Ccnd1, Nptx1, Pou3f1, Akap12, Igfbp4, Serpine2, Ier3, Nav1, Pttg1, Cenpa 
##     Tnfrsf12a, Rap1b, Spry4, Apln, Odc1, Rhoq, Fgf1, Ccnd2, Arl6ip1, Plk2 
##     Ppp1r2, Tubb4b, Cdc20, Camkk2, Amotl1, Hoxb6, Hmga2, Pdgfa, Chn2, Klf9 
## Negative:  Sox4, Fabp7, Nap1l5, Trap1a, Gstp1, Btg2, Tox3, Tmsb4x, Igfbp5, Sox2 
##     Rgma, Nr2f1, Id3, Ckb, Sox21, Prss23, Ftl1-ps1, Mageb16, Emcn, Myo10 
##     Fabp5, Hist1h2ap, Hes6, Adcyap1r1, Peg3, Notch1, Sesn3, Ifitm3, Btg1, Hist1h1b
# Visualisation
do_DimPlot(seu_subset, group.by = "Phase", pt.size = 1.5, plot.title = "Phase")

ggsave(filename = "write/figures/Cell Cycle Phase PCA.png",
       height = 10,
       width = 15)

do_DimPlot(seu_subset, group.by = "batch", pt.size = 1.5, plot.title = "Batch")

ggsave(filename = "write/figures/Batch PCA.png",
       height = 10,
       width = 15)

do_DimPlot(seu_subset, group.by = "condition", pt.size = 1.5, plot.title = "Condition")

ggsave(filename = "write/figures/Condition PCA.png",
       height = 10,
       width = 15)

6.1.1. Checking for Technical Variation

  • As already mentioned, PCA is an excellent QC step in and within itself because we could visualise potential technical sources of variation in the data, which we would then need to regress out to uncover the primary drivers of biological variation.

  • Here, we will be plotting library size, total number of features, percentage mitochondrial genes, and batch against PCs 1-2 to see if these technical factors are a significant driver of the variation in the data. We will also be plotting cell cycle phase as well as our experimental condition of interest.

# Technical Factors
## Library Size
do_FeaturePlot(seu_subset, 
        reduction = "pca", 
        features = "nCount_RNA", 
        dims = c(1,2), 
        use_viridis = T,
        viridis.palette = "B",
        viridis.direction = -1, 
        pt.size = 1.5,
        plot.title = "Library Size")

ggsave(filename = "write/figures/Library Size PCA.png",
       height = 10,
       width = 15)

## Total Number of Features
do_FeaturePlot(seu_subset, 
        reduction = "pca", 
        features = "nFeature_RNA", 
        dims = c(1,2),
        use_viridis = T,
        viridis.palette = "B",
        viridis.direction = -1,
        pt.size = 1.5, 
        plot.title = "Total Features")

ggsave(filename = "write/figures/Total Features PCA.png",
       height = 10,
       width = 15)

## Percentage MT
do_FeaturePlot(seu_subset, 
        reduction = "pca", 
        features = "percent.mt", 
        dims = c(1,2),
        use_viridis = T,
        viridis.palette = "B",
        viridis.direction = -1, 
        pt.size = 1.5,
        plot.title = "% Mitochondrial Genes")

ggsave(filename = "write/figures/Percentage Mitochondrial Genes PCA.png",
       height = 10,
       width = 15)
  • From the above figures, we notice the following:

    • Technical factors are not the primary sources of variation in the data.

    • There is a strong cell cycle effect along PC1.

6.1.2. Condition of Interest

  • Here, we check for the PCs along which the primary source of variation is our condition of interest. In this case, it seems to be the case that our condition of interest is the primary source of variation along PC6.

  • This result is not surprising given the dominant effect of the cell cycle on the variation in the data, so that is why the next section will take care of cell cycle regression before proceeding with subsequent steps in the analysis.

## Condition
do_DimPlot(seu_subset, 
        reduction = "pca", 
        dims = c(1,2), 
        group.by = "condition")

do_DimPlot(seu_subset, 
        reduction = "pca", 
        dims = c(2,3), 
        group.by = "condition")

do_DimPlot(seu_subset, 
        reduction = "pca", 
        dims = c(3,4), 
        group.by = "condition")

do_DimPlot(seu_subset, 
        reduction = "pca", 
        dims = c(4, 5), 
        group.by = "condition")

do_DimPlot(seu_subset, 
        reduction = "pca", 
        dims = c(5,6), 
        group.by = "condition")

6.1.3. Cell Cycle Regression

  • Given that the focus of this analysis is not cell cycle-driven variation, we will regress out cell cycle variation using SCTransform(), as is shown in the code below. We then run PCA on the updated count matrices and re-do the visualisations to ensure that cell cycle variation is no longer a primary driver.
# SCTransform (Cell Cycle Regression)
seu_subset <- suppressWarnings(SCTransform(object = seu_subset, 
                                           variable.features.n = 3000, 
                                           vars.to.regress = "Phase"))
## Calculating cell attributes from input UMI matrix: log_umi
## Variance stabilizing transformation of count matrix of size 15822 by 2019
## Model formula is y ~ log_umi
## Get Negative Binomial regression parameters per gene
## Using 2000 genes, 2019 cells
##   |                                                                              |                                                                      |   0%  |                                                                              |==================                                                    |  25%  |                                                                              |===================================                                   |  50%  |                                                                              |====================================================                  |  75%  |                                                                              |======================================================================| 100%
## Found 80 outliers - those will be ignored in fitting/regularization step
## Second step: Get residuals using fitted parameters for 15822 genes
##   |                                                                              |                                                                      |   0%  |                                                                              |==                                                                    |   3%  |                                                                              |====                                                                  |   6%  |                                                                              |=======                                                               |   9%  |                                                                              |=========                                                             |  12%  |                                                                              |===========                                                           |  16%  |                                                                              |=============                                                         |  19%  |                                                                              |===============                                                       |  22%  |                                                                              |==================                                                    |  25%  |                                                                              |====================                                                  |  28%  |                                                                              |======================                                                |  31%  |                                                                              |========================                                              |  34%  |                                                                              |==========================                                            |  38%  |                                                                              |============================                                          |  41%  |                                                                              |===============================                                       |  44%  |                                                                              |=================================                                     |  47%  |                                                                              |===================================                                   |  50%  |                                                                              |=====================================                                 |  53%  |                                                                              |=======================================                               |  56%  |                                                                              |==========================================                            |  59%  |                                                                              |============================================                          |  62%  |                                                                              |==============================================                        |  66%  |                                                                              |================================================                      |  69%  |                                                                              |==================================================                    |  72%  |                                                                              |====================================================                  |  75%  |                                                                              |=======================================================               |  78%  |                                                                              |=========================================================             |  81%  |                                                                              |===========================================================           |  84%  |                                                                              |=============================================================         |  88%  |                                                                              |===============================================================       |  91%  |                                                                              |==================================================================    |  94%  |                                                                              |====================================================================  |  97%  |                                                                              |======================================================================| 100%
## Computing corrected count matrix for 15822 genes
##   |                                                                              |                                                                      |   0%  |                                                                              |==                                                                    |   3%  |                                                                              |====                                                                  |   6%  |                                                                              |=======                                                               |   9%  |                                                                              |=========                                                             |  12%  |                                                                              |===========                                                           |  16%  |                                                                              |=============                                                         |  19%  |                                                                              |===============                                                       |  22%  |                                                                              |==================                                                    |  25%  |                                                                              |====================                                                  |  28%  |                                                                              |======================                                                |  31%  |                                                                              |========================                                              |  34%  |                                                                              |==========================                                            |  38%  |                                                                              |============================                                          |  41%  |                                                                              |===============================                                       |  44%  |                                                                              |=================================                                     |  47%  |                                                                              |===================================                                   |  50%  |                                                                              |=====================================                                 |  53%  |                                                                              |=======================================                               |  56%  |                                                                              |==========================================                            |  59%  |                                                                              |============================================                          |  62%  |                                                                              |==============================================                        |  66%  |                                                                              |================================================                      |  69%  |                                                                              |==================================================                    |  72%  |                                                                              |====================================================                  |  75%  |                                                                              |=======================================================               |  78%  |                                                                              |=========================================================             |  81%  |                                                                              |===========================================================           |  84%  |                                                                              |=============================================================         |  88%  |                                                                              |===============================================================       |  91%  |                                                                              |==================================================================    |  94%  |                                                                              |====================================================================  |  97%  |                                                                              |======================================================================| 100%
## Calculating gene attributes
## Wall clock passed: Time difference of 19.72644 secs
## Determine variable features
## Place corrected count matrix in counts slot
## Regressing out Phase
## Centering data matrix
## Set default assay to SCT
# PCA
seu_subset <- RunPCA(seu_subset, 
              features = VariableFeatures(object = seu_subset), 
              verbose = FALSE)
  • After regression of the cell cycle above, the subsequent visualisations show the following:

    • PC1 is no longer driven by cell cycle anymore.

    • PC5 is now driven by our experimental condition.

# Visualisation
do_DimPlot(seu_subset, 
           reduction = "pca", 
           dims = c(1,2), 
           group.by = "Phase", 
           pt.size = 1.5)

ggsave(filename = "write/figures/Cell Cycle Phase PCA (After Cell Cycle Regression).png",
       height = 10,
       width = 15)

do_DimPlot(seu_subset, 
        reduction = "pca", 
        dims = c(4,5), 
        group.by = "condition", 
        pt.size = 1.5)

ggsave(filename = "write/figures/Condition PCA (After Cell Cycle Regression).png",
       height = 10,
       width = 15)

6.1.4. Elbow Plot

  • The next step in the pipeline is non-linear dimensionality reduction in the form of UMAP. UMAP uses PCs as its features.

  • To define the optimal number of PCs to include for UMAP, an elbow plot is used to identify the threshold for PCA beyond which PCs are no longer representing any significant variation and are likely dominated by technical variance.

# Elbow Plot
ElbowPlot(seu_subset, 
          ndims = 40)

pcs_use <- 1:20

6.2. UMAP

6.2.1. UMAP Run

# UMAP
seu_subset <- RunUMAP(seu_subset, 
                      dims = pcs_use)
## 09:54:21 UMAP embedding parameters a = 0.9922 b = 1.112
## 09:54:21 Read 2019 rows and found 20 numeric columns
## 09:54:21 Using Annoy for neighbor search, n_neighbors = 30
## 09:54:21 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 09:54:21 Writing NN index file to temp file /var/folders/vc/clz2nb_n7g1c4k13hs8fl3l00000gn/T//RtmpWvqBaX/file766065be2476
## 09:54:21 Searching Annoy index using 1 thread, search_k = 3000
## 09:54:22 Annoy recall = 100%
## 09:54:23 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 09:54:24 Initializing from normalized Laplacian + noise (using irlba)
## 09:54:24 Commencing optimization for 500 epochs, with 77554 positive edges
## 09:54:27 Optimization finished

6.2.2. UMAP Results Visualisation

# Visualisation
## Experimental Condition
do_DimPlot(seu_subset, 
           reduction = "umap", 
           group.by = "condition", 
           pt.size = 1.5)

ggsave(filename = "write/figures/Condition UMAP.png",
       height = 10,
       width = 15)

## Cell Cycle Phase
do_DimPlot(seu_subset, 
           reduction = "umap", 
           group.by = "Phase", 
           pt.size = 1.5)

ggsave(filename = "write/figures/Cell Cycle Phase UMAP.png",
       height = 10,
       width = 15)

7. Clustering

7.1. Louvain Clustering (Multiple Resolutions)

  • The next step in the analysis pipeline is clustering analysis; clustering helps identify clusters in the data that correspond to different forms of variation, and they could then be used as the basis for differential expression analysis to understand the biology underlying the variation in the data.

  • Seurat uses community detection algorithms for clustering, namely louvain and leiden. For more information on these algorithms, please check here.

  • Clustering below relies on PCA, and the top 20 PCs are again the input to the clustering algorithm. The louvain algorithm requires a resolution parameter, which is essential to fine-tune the extent of clustering in the data; low resolution values result in under-clustering, and high values result in over-clustering.

    • To remedy this issue, we compute clustering results using different resolution values ranging from 0 to 1, and we visualise these results using clustering trees. Clustering trees will be explained in the next section
for(i in seq(0,1,0.1)){
seu_subset <- seu_subset %>%
  FindNeighbors(reduction = "pca",
                dims = 1:20,
                verbose = F) %>%
  FindClusters(resolution = i,
               verbose = F)
}

7.2. Clustering Trees

  • Clustering analysis is used in many contexts to group similar samples. One problem when conducting this kind of analysis is how many clusters to use. This is usually controlled by a parameter provided to the clustering algorithm, such as \(k\) for \(k\)-means clustering.

  • Statistics designed to help you make this choice typically either compare two clusterings or score a single clustering. A clustering tree is different in that it visualizes the relationships between at a range of resolutions.

  • To build a clustering tree we need to look at how cells move as the clustering resolution is increased. Each cluster forms a node in the tree and edges are constructed by considering the cells in a cluster at a lower resolution (say \(k = 2\)) that end up in a cluster at the next highest resolution (say \(k = 3\)).

  • By connecting clusters in this way we can see how clusters are related to each other, which are clearly distinct and which are unstable.

  • Extra information about the cells in each node can also be overlaid in order to help make the decision about which resolution to use. For more information about clustering trees please refer to the main publication here.

  • Based on the results of the clustering trees, the final clustering resolution we are going with is 0.2, which gives us a total of 3 clusters.

#Clustering Tree (with resolutions)
c1 <- clustree::clustree(x = seu_subset) +
  guides(edge_colour = FALSE, edge_alpha = FALSE) +
  theme(legend.position = "bottom")

# Clustering Tree (with SC3 Stability Index)
c2 <- clustree::clustree(
  x = seu_subset,
  node_colour = "sc3_stability"
) +
  guides(edge_colour = FALSE, edge_alpha = FALSE) +
  theme(legend.position = "bottom") +
  scale_color_viridis_c(option = "B")
c1 + c2

ggsave("write/figures/Clustrees.png",
  height = 10,
  width = 15
)

# Final Clustering Resolution
seu_subset <- seu_subset %>%
  FindNeighbors(
    reduction = "pca",
    dims = 1:20,
    verbose = F
  ) %>%
  FindClusters(
    resolution = 0.2,
    verbose = F
  )

7.2. Clustering Results Visualisation

  • A comparison of the condition and clusters clearly shows the following:

    • Clusters 1 and 2 are WT clusters.

    • Cluster 0 is a SCRAMBLED cluster.

p1 <- do_DimPlot(seu_subset,
  reduction = "umap",
  group.by = "seurat_clusters",
  label = T,
  pt.size = 1.5
)
ggsave(
  filename = "write/figures/Seurat Clusters UMAP.png",
  plot = p1,
  height = 10,
  width = 10
)

p2 <- do_DimPlot(seu_subset,
  reduction = "umap",
  group.by = "condition",
  label = T,
  pt.size = 1.5
)
p1 + p2

8. Feature Plots & Violin Plots

# Feature Plots
do_FeaturePlot(
  sample = seu_subset,
  features = c("Fos", "Gria3", "Junb", "Trap1a", "Gstp1", "Nap1l5"),
  reduction = "umap",
  ncol = 3,
  viridis.direction = -1,
  viridis.palette = "B",
  use_viridis = T,
  pt.size = 1.5
)

ggsave(
  filename = "write/figures/Features_FeaturePlots.png",
  height = 15,
  width = 20
)
# Violin Plots
VlnPlot(seu_subset,
  features = c("Fos", "Gria3", "Junb", "Trap1a", "Gstp1", "Nap1l5"),
  ncol = 3,
  group.by = "condition",
  pt.size = 1,
  assay = "RNA"
)

ggsave(
  filename = "write/figures/Features_ViolinPlots.png",
  height = 15,
  width = 20
)

9. Differential Expression Analysis (DEA)

# Cluster Markers
ClusterMarkers <- FindAllMarkers(
  object = seu_subset,
  logfc.threshold = 0.25,
  min.pct = 0.25
) %>% dplyr::filter(p_val_adj < 0.05)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
write_csv(
  x = ClusterMarkers,
  file = "write/tables/ClusterMarkers.csv"
)

# Condition Markers
ConditionMarkers <- FindMarkers(
  object = seu_subset,
  ident.1 = "SCRAMBLED",
  ident.2 = "WT",
  group.by = "condition",
  logfc.threshold = 0.25,
  min.pct = 0.25
) %>% dplyr::filter(p_val_adj < 0.05)
write_csv(
  x = ConditionMarkers,
  file = "write/tables/ConditionMarkers.csv"
)

Session Information

sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-apple-darwin20
## Running under: macOS 15.1.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Africa/Cairo
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] clusterProfiler_4.14.4      org.Mm.eg.db_3.20.0        
##  [3] AnnotationDbi_1.68.0        scater_1.34.0              
##  [5] scuttle_1.14.0              SingleCellExperiment_1.26.0
##  [7] SummarizedExperiment_1.34.0 Biobase_2.64.0             
##  [9] GenomicRanges_1.56.1        GenomeInfoDb_1.40.1        
## [11] IRanges_2.38.1              S4Vectors_0.42.1           
## [13] BiocGenerics_0.50.0         MatrixGenerics_1.16.0      
## [15] matrixStats_1.4.1           SCpubr_2.0.2               
## [17] SeuratObject_5.0.2          Seurat_4.3.0               
## [19] lubridate_1.9.3             forcats_1.0.0              
## [21] stringr_1.5.1               dplyr_1.1.4                
## [23] purrr_1.0.2                 readr_2.1.5                
## [25] tidyr_1.3.1                 tibble_3.2.1               
## [27] ggplot2_3.5.1               tidyverse_2.0.0            
## [29] pak_0.8.0                  
## 
## loaded via a namespace (and not attached):
##   [1] fs_1.6.4                  spatstat.sparse_3.1-0    
##   [3] enrichplot_1.24.2         httr_1.4.7               
##   [5] RColorBrewer_1.1-3        backports_1.5.0          
##   [7] tools_4.4.1               sctransform_0.4.1        
##   [9] utf8_1.2.4                R6_2.5.1                 
##  [11] lazyeval_0.2.2            uwot_0.2.2               
##  [13] withr_3.0.1               sp_2.1-4                 
##  [15] gridExtra_2.3             progressr_0.14.0         
##  [17] textshaping_0.4.0         cli_3.6.3                
##  [19] spatstat.explore_3.3-2    scatterpie_0.2.4         
##  [21] labeling_0.4.3            sass_0.4.9               
##  [23] spatstat.data_3.1-2       ggridges_0.5.6           
##  [25] pbapply_1.7-2             systemfonts_1.1.0        
##  [27] yulab.utils_0.1.7         gson_0.1.0               
##  [29] DOSE_3.30.4               R.utils_2.12.3           
##  [31] parallelly_1.38.0         limma_3.60.4             
##  [33] rstudioapi_0.16.0         RSQLite_2.3.7            
##  [35] generics_0.1.3            gridGraphics_0.5-1       
##  [37] vroom_1.6.5               ica_1.0-3                
##  [39] spatstat.random_3.3-2     GO.db_3.19.1             
##  [41] Matrix_1.7-0              ggbeeswarm_0.7.2         
##  [43] fansi_1.0.6               abind_1.4-8              
##  [45] R.methodsS3_1.8.2         lifecycle_1.0.4          
##  [47] yaml_2.3.10               qvalue_2.36.0            
##  [49] SparseArray_1.4.8         Rtsne_0.17               
##  [51] grid_4.4.1                blob_1.2.4               
##  [53] promises_1.3.0            crayon_1.5.3             
##  [55] miniUI_0.1.1.1            lattice_0.22-6           
##  [57] beachmat_2.20.0           cowplot_1.1.3            
##  [59] KEGGREST_1.44.1           pillar_1.9.0             
##  [61] knitr_1.48                fgsea_1.30.0             
##  [63] future.apply_1.11.2       codetools_0.2-20         
##  [65] fastmatch_1.1-4           leiden_0.4.3.1           
##  [67] glue_1.7.0                ggfun_0.1.6              
##  [69] spatstat.univar_3.0-1     data.table_1.16.0        
##  [71] treeio_1.28.0             vctrs_0.6.5              
##  [73] png_0.1-8                 spam_2.10-0              
##  [75] gtable_0.3.5              assertthat_0.2.1         
##  [77] cachem_1.1.0              xfun_0.47                
##  [79] S4Arrays_1.4.1            mime_0.12                
##  [81] tidygraph_1.3.1           survival_3.6-4           
##  [83] statmod_1.5.0             fitdistrplus_1.2-1       
##  [85] ROCR_1.0-11               nlme_3.1-164             
##  [87] ggtree_3.12.0             bit64_4.5.2              
##  [89] RcppAnnoy_0.0.22          bslib_0.8.0              
##  [91] irlba_2.3.5.1             vipor_0.4.7              
##  [93] KernSmooth_2.23-24        colorspace_2.1-1         
##  [95] DBI_1.2.3                 ggrastr_1.0.2            
##  [97] tidyselect_1.2.1          bit_4.5.0                
##  [99] compiler_4.4.1            httr2_1.0.4              
## [101] BiocNeighbors_1.22.0      DelayedArray_0.30.1      
## [103] plotly_4.10.4             shadowtext_0.1.4         
## [105] checkmate_2.3.2           scales_1.3.0             
## [107] lmtest_0.9-40             rappdirs_0.3.3           
## [109] digest_0.6.37             goftest_1.2-3            
## [111] spatstat.utils_3.1-0      rmarkdown_2.28           
## [113] XVector_0.44.0            htmltools_0.5.8.1        
## [115] pkgconfig_2.0.3           sparseMatrixStats_1.16.0 
## [117] highr_0.11                fastmap_1.2.0            
## [119] rlang_1.1.4               htmlwidgets_1.6.4        
## [121] UCSC.utils_1.0.0          shiny_1.9.1              
## [123] DelayedMatrixStats_1.26.0 farver_2.1.2             
## [125] jquerylib_0.1.4           zoo_1.8-12               
## [127] jsonlite_1.8.9            BiocParallel_1.38.0      
## [129] GOSemSim_2.30.2           R.oo_1.26.0              
## [131] BiocSingular_1.20.0       magrittr_2.0.3           
## [133] GenomeInfoDbData_1.2.12   ggplotify_0.1.2          
## [135] dotCall64_1.1-1           patchwork_1.3.0          
## [137] munsell_0.5.1             Rcpp_1.0.13              
## [139] ape_5.8                   viridis_0.6.5            
## [141] reticulate_1.39.0         stringi_1.8.4            
## [143] ggraph_2.2.1              zlibbioc_1.50.0          
## [145] MASS_7.3-60.2             plyr_1.8.9               
## [147] parallel_4.4.1            listenv_0.9.1            
## [149] ggrepel_0.9.6             deldir_2.0-4             
## [151] Biostrings_2.72.1         graphlayouts_1.2.0       
## [153] splines_4.4.1             tensor_1.5               
## [155] hms_1.1.3                 clustree_0.5.1           
## [157] igraph_2.0.3              spatstat.geom_3.3-3      
## [159] reshape2_1.4.4            ScaledMatrix_1.12.0      
## [161] evaluate_1.0.0            tzdb_0.4.0               
## [163] tweenr_2.0.3              httpuv_1.6.15            
## [165] RANN_2.6.2                polyclip_1.10-7          
## [167] future_1.34.0             scattermore_1.2          
## [169] ggforce_0.4.2             rsvd_1.0.5               
## [171] xtable_1.8-4              tidytree_0.4.6           
## [173] later_1.3.2               ragg_1.3.3               
## [175] viridisLite_0.4.2         aplot_0.2.3              
## [177] memoise_2.0.1             beeswarm_0.4.0           
## [179] cluster_2.1.6             timechange_0.3.0         
## [181] globals_0.16.3